Data Exploration for five New York Times datasets

# Load the five New York Times CSV files into data frames nyt2 .. nyt6.
# The data directory is defined once so the files can be relocated by editing
# a single line; file.path() joins with "/" so the resulting paths are
# unchanged.
# NOTE: the plotting code in this report calls ggplot() and grid.arrange(),
# so ggplot2 and gridExtra must be attached before it is run.
nyt_dir <- "C:/Users/kausha2/Documents/Data Analytics/Assignments/dds_ch2_nyt"

nyt2 <- read.csv(file.path(nyt_dir, "nyt2.csv"))
nyt3 <- read.csv(file.path(nyt_dir, "nyt3.csv"))
nyt4 <- read.csv(file.path(nyt_dir, "nyt4.csv"))
nyt5 <- read.csv(file.path(nyt_dir, "nyt5.csv"))
nyt6 <- read.csv(file.path(nyt_dir, "nyt6.csv"))

Boxplots of variables

Boxplots for the Age Variable

# Boxplots of Age split by Gender, one plot per dataset (nyt2 .. nyt6).
# Gender is wrapped in factor() so each distinct value gets its own box.
p1 <- ggplot(nyt2, aes(factor(Gender), Age)) + geom_boxplot(fill = "blue")
p2 <- ggplot(nyt3, aes(factor(Gender), Age)) + geom_boxplot(fill = "red")
p3 <- ggplot(nyt4, aes(factor(Gender), Age)) + geom_boxplot(fill = "yellow")
p4 <- ggplot(nyt5, aes(factor(Gender), Age)) + geom_boxplot(fill = "green")
p5 <- ggplot(nyt6, aes(factor(Gender), Age)) + geom_boxplot(fill = "orange")

# Arrange all five plots on a 3 x 2 grid (the sixth cell stays empty).
# p5 is passed here; passing p4 twice would leave nyt6 out of the figure.
grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)

Boxplot of Impressions in terms of gender.

# Boxplots of Impressions split by Gender, one plot per dataset.
# Each plot reads its own data frame (nyt2 .. nyt6) so the five panels really
# compare the five datasets rather than repeating nyt2.
p1 <- ggplot(nyt2, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "blue")
p2 <- ggplot(nyt3, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "red")
p3 <- ggplot(nyt4, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "green")
p4 <- ggplot(nyt5, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "yellow")
p5 <- ggplot(nyt6, aes(factor(Gender), Impressions)) +
  geom_boxplot(fill = "orange")

# Five plots on a 3 x 2 grid; the last cell is left empty.
grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol = 2)

Summary 1: In the study conducted by the NY Times, the mean age of women is lower than that of men. This holds across all five datasets.

Summary 2: For the impressions variables, across the 5 datasets, the means have more or less the same value, i.e. the mean is about 5 impressions for both men and women across all the five datasets.


Histograms of the Age variable for each dataset

# Histograms of Age for each dataset, using bins 5 units wide.
p1 <- ggplot(data = nyt2, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "blue")
p2 <- ggplot(data = nyt3, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "red")
p3 <- ggplot(data = nyt4, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "yellow")
p4 <- ggplot(data = nyt5, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "green")
p5 <- ggplot(data = nyt6, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "orange")

# Lay the five histograms out on a 3 x 2 grid.
grid.arrange(grobs = list(p1, p2, p3, p4, p5), nrow = 3, ncol = 2)

Observation : There are a lot of missing values (recorded as an Age of zero), therefore we re-plot the histograms with the x-axis restricted to ages between 5 and 100

# Same Age histograms as before (5-unit bins), but with the x-axis limited to
# the range 5..100 via xlim(), which drops observations outside that range.
p1 <- ggplot(data = nyt2, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "blue") +
  xlim(5, 100)
p2 <- ggplot(data = nyt3, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "red") +
  xlim(5, 100)
p3 <- ggplot(data = nyt4, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "yellow") +
  xlim(5, 100)
p4 <- ggplot(data = nyt5, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "green") +
  xlim(5, 100)
p5 <- ggplot(data = nyt6, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "orange") +
  xlim(5, 100)

# Five panels on a 3 x 2 grid.
grid.arrange(grobs = list(p1, p2, p3, p4, p5), nrow = 3, ncol = 2)

Histogram for Impressions

# Histograms of Impressions for each dataset: one bin per unit, bars outlined
# in a contrasting colour, x-axis limited to -1..16.
p1 <- ggplot(data = nyt2, mapping = aes(x = Impressions)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "red") +
  xlim(-1, 16)
p2 <- ggplot(data = nyt3, mapping = aes(x = Impressions)) +
  geom_histogram(binwidth = 1, fill = "red", color = "blue") +
  xlim(-1, 16)
p3 <- ggplot(data = nyt4, mapping = aes(x = Impressions)) +
  geom_histogram(binwidth = 1, fill = "yellow", color = "green") +
  xlim(-1, 16)
p4 <- ggplot(data = nyt5, mapping = aes(x = Impressions)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black") +
  xlim(-1, 16)
p5 <- ggplot(data = nyt6, mapping = aes(x = Impressions)) +
  geom_histogram(binwidth = 1, fill = "green", color = "yellow") +
  xlim(-1, 16)

# Five panels on a 3 x 2 grid.
grid.arrange(grobs = list(p1, p2, p3, p4, p5), nrow = 3, ncol = 2)

Summary : There are a lot of missing values in the Age variable. Thus the distributions are slightly modified to show only values greater than zero. Thus it can be seen that, the age variable has a slight positive skew. The same can be said about the distribution of the Impressions variable which has a slight positive skew.

Empirical CDFs and Quantile-Quantile Plots


# Empirical cumulative distribution of Age for each dataset, drawn as a
# step function.
p1 <- ggplot(data = nyt2, mapping = aes(x = Age)) +
  stat_ecdf(color = "blue", geom = "step")
p2 <- ggplot(data = nyt3, mapping = aes(x = Age)) +
  stat_ecdf(color = "red", geom = "step")
p3 <- ggplot(data = nyt4, mapping = aes(x = Age)) +
  stat_ecdf(color = "darkgreen", geom = "step")
p4 <- ggplot(data = nyt5, mapping = aes(x = Age)) +
  stat_ecdf(color = "darkblue", geom = "step")
p5 <- ggplot(data = nyt6, mapping = aes(x = Age)) +
  stat_ecdf(color = "orange", geom = "step")

# Five panels on a 3 x 2 grid.
grid.arrange(grobs = list(p1, p2, p3, p4, p5), nrow = 3, ncol = 2)

# Empirical cumulative distribution of Impressions for each dataset, drawn as
# a step function.
p1 <- ggplot(data = nyt2, mapping = aes(x = Impressions)) +
  stat_ecdf(color = "blue", geom = "step")
p2 <- ggplot(data = nyt3, mapping = aes(x = Impressions)) +
  stat_ecdf(color = "red", geom = "step")
p3 <- ggplot(data = nyt4, mapping = aes(x = Impressions)) +
  stat_ecdf(color = "darkgreen", geom = "step")
p4 <- ggplot(data = nyt5, mapping = aes(x = Impressions)) +
  stat_ecdf(color = "darkblue", geom = "step")
p5 <- ggplot(data = nyt6, mapping = aes(x = Impressions)) +
  stat_ecdf(color = "orange", geom = "step")

# Five panels on a 3 x 2 grid.
grid.arrange(grobs = list(p1, p2, p3, p4, p5), nrow = 3, ncol = 2)

Quantile Quantile Plots

# Quantile-quantile plots of Age for each dataset; stat_qq() compares the
# sample quantiles with those of a normal distribution by default.
p1 <- ggplot(data = nyt2, mapping = aes(sample = Age)) +
  stat_qq(colour = "blue")
p2 <- ggplot(data = nyt3, mapping = aes(sample = Age)) +
  stat_qq(colour = "red")
p3 <- ggplot(data = nyt4, mapping = aes(sample = Age)) +
  stat_qq(colour = "yellow")
p4 <- ggplot(data = nyt5, mapping = aes(sample = Age)) +
  stat_qq(colour = "orange")
p5 <- ggplot(data = nyt6, mapping = aes(sample = Age)) +
  stat_qq(colour = "orange")

# Five panels on a 3 x 2 grid.
grid.arrange(grobs = list(p1, p2, p3, p4, p5), nrow = 3, ncol = 2)

# Quantile-quantile plots of Impressions for each dataset; stat_qq() compares
# the sample quantiles with those of a normal distribution by default.
p1 <- ggplot(data = nyt2, mapping = aes(sample = Impressions)) +
  stat_qq(colour = "blue")
p2 <- ggplot(data = nyt3, mapping = aes(sample = Impressions)) +
  stat_qq(colour = "red")
p3 <- ggplot(data = nyt4, mapping = aes(sample = Impressions)) +
  stat_qq(colour = "yellow")
p4 <- ggplot(data = nyt5, mapping = aes(sample = Impressions)) +
  stat_qq(colour = "orange")
p5 <- ggplot(data = nyt6, mapping = aes(sample = Impressions)) +
  stat_qq(colour = "orange")

# Five panels on a 3 x 2 grid.
grid.arrange(grobs = list(p1, p2, p3, p4, p5), nrow = 3, ncol = 2)

Summary : Looking at the ecdfs of Impressions we see that all the values are centered around 5 which is likely the median. The quantile quantile plots also follow a near straight line and hence we can conclude that the distribution is normal between 0 - 10. The same can be said about the Age variable, which initially has a bunch of missing values, but it follows an almost normal distribution between 20 to 60.

To check for normality we need to perform more tests such as the one below

Significance Testing

To check if the variables are significant, we first need to check that they follow a normal distribution. Only then can we conclude anything about significance.

We first define our hypotheses :

  • Null Hypothesis : The variables follow a Normal Distribution

  • Alternative Hypothesis : The variables do not follow a Normal Distribution

We use the Anderson - Darling test to check for normality

# We first check for normality

	Anderson-Darling normality test

data:  nyt2$Age
A = 1263.1, p-value < 2.2e-16
	Anderson-Darling normality test

data:  nyt2$Impressions
A = 3515, p-value < 2.2e-16

	Anderson-Darling normality test

data:  nyt3$Age
A = 1233.8, p-value < 2.2e-16
	Anderson-Darling normality test

data:  nyt3$Impressions
A = 3502.7, p-value < 2.2e-16

	Anderson-Darling normality test

data:  nyt4$Age
A = 12645, p-value < 2.2e-16
	Anderson-Darling normality test

data:  nyt4$Impressions
A = 4614.8, p-value < 2.2e-16

	Anderson-Darling normality test

data:  nyt5$Age
A = 10541, p-value < 2.2e-16
	Anderson-Darling normality test

data:  nyt5$Impressions
A = 3843.1, p-value < 2.2e-16

	Anderson-Darling normality test

data:  nyt6$Age
A = 21752, p-value < 2.2e-16
	Anderson-Darling normality test

data:  nyt6$Impressions
A = 8007.3, p-value < 2.2e-16

Summary : The p-values are far too low, so we reject the null hypothesis and conclude that the distributions are NOT normal. We cannot use an F-Test here because the variables do not follow a normal distribution.

Taking it a step further : Plotting just for nyt2, we can see if any relationships exist between age and impressions

# Scatter plot of Impressions against Age for nyt2, with points coloured by
# Gender. `colour` is mapped rather than `fill` because geom_point()'s default
# point shape has no fill, so a fill mapping would leave every point looking
# the same.
ggplot(nyt2, aes(x = Age, y = Impressions, colour = factor(Gender))) +
  geom_point()

Summary : From the above graph we see that there is no significant relationship between the Age and Impressions variables.

Filtering the datasets to create better visualizations

# Keep only the rows of nyt2 and nyt3 where both Impressions and Age are
# strictly positive. Rows where the condition evaluates to NA are dropped as
# well. Both data frames are overwritten with their filtered versions.
nyt2 <- subset(nyt2, Impressions > 0 & Age > 0)
nyt3 <- subset(nyt3, Impressions > 0 & Age > 0)

More boxplots

Filtering the Age variable according to Gender and Clicks

# Boxplots of Age by Gender for nyt2 and nyt3, with a separate box (fill
# colour) for every distinct Clicks value within each Gender.
p1 <- ggplot(data = nyt2, mapping = aes(x = factor(Gender), y = Age)) +
  geom_boxplot(mapping = aes(fill = factor(Clicks)))
p2 <- ggplot(data = nyt3, mapping = aes(x = factor(Gender), y = Age)) +
  geom_boxplot(mapping = aes(fill = factor(Clicks)))

# Stack the two plots vertically.
grid.arrange(grobs = list(p1, p2), nrow = 2)

Filtering the Impressions variable according to Gender and Clicks

# Boxplots of Impressions by Gender for nyt2 and nyt3, with a separate box
# (fill colour) for every distinct Clicks value within each Gender.
p1 <- ggplot(data = nyt2, mapping = aes(x = factor(Gender), y = Impressions)) +
  geom_boxplot(mapping = aes(fill = factor(Clicks)))
p2 <- ggplot(data = nyt3, mapping = aes(x = factor(Gender), y = Impressions)) +
  geom_boxplot(mapping = aes(fill = factor(Clicks)))

# Stack the two plots vertically.
grid.arrange(grobs = list(p1, p2), nrow = 2)

Summary : We can conclude that men and women behave more or less the same for 0-2 clicks. However, men tend to have a greater number of clicks, as even the means are higher for more than 3 clicks in the case of nyt2 and for 4 clicks in the case of nyt3.

More Histograms

Filtering the histogram for Age with Gender

# Histograms of Age (5-unit bins) for nyt2 and nyt3, with bars filled by
# Gender.
p1 <- ggplot(data = nyt2, mapping = aes(x = Age)) +
  geom_histogram(mapping = aes(fill = factor(Gender)), binwidth = 5)
p2 <- ggplot(data = nyt3, mapping = aes(x = Age)) +
  geom_histogram(mapping = aes(fill = factor(Gender)), binwidth = 5)

# Stack the two plots vertically.
grid.arrange(grobs = list(p1, p2), nrow = 2)

Filtering the Impressions variable according to Gender

# Histograms of Impressions (1-unit bins) for nyt2 and nyt3, with bars filled
# by Gender.
p1 <- ggplot(data = nyt2, mapping = aes(x = Impressions)) +
  geom_histogram(mapping = aes(fill = factor(Gender)), binwidth = 1)
p2 <- ggplot(data = nyt3, mapping = aes(x = Impressions)) +
  geom_histogram(mapping = aes(fill = factor(Gender)), binwidth = 1)

# Stack the two plots vertically.
grid.arrange(grobs = list(p1, p2), nrow = 2)

Summary : We can conclude from the above positively skewed histogram that women have more impressions in general than do men.

More ECDFs and Quantile-Quantile plots

# ECDFs of Age (p1, p2) and Impressions (p3, p4) for nyt2 and nyt3, with one
# step curve per Gender value.
# Columns are referenced by bare name inside aes() so ggplot2 looks them up in
# `data`; writing `nyt2$Age` there is discouraged by ggplot2 because it
# bypasses the plot's data.
p1 <- ggplot(data = nyt2, aes(Age, group = Gender, colour = Gender)) +
  stat_ecdf(geom = "step")

p2 <- ggplot(data = nyt3, aes(Age, group = Gender, colour = Gender)) +
  stat_ecdf(geom = "step")

p3 <- ggplot(data = nyt2, aes(Impressions, group = Gender, colour = Gender)) +
  stat_ecdf(geom = "step")

p4 <- ggplot(data = nyt3, aes(Impressions, group = Gender, colour = Gender)) +
  stat_ecdf(geom = "step")

# Four panels on a 2 x 2 grid.
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Quantile-quantile plots of Age (p1, p2) and Impressions (p3, p4) for nyt2
# and nyt3, grouped and coloured by Gender.
# Columns are referenced by bare name inside aes() so ggplot2 looks them up in
# `data`; writing `nyt2$Age` there is discouraged by ggplot2 because it
# bypasses the plot's data.
p1 <- ggplot(data = nyt2, aes(sample = Age, group = Gender, colour = Gender)) +
  stat_qq()

p2 <- ggplot(data = nyt3, aes(sample = Age, group = Gender, colour = Gender)) +
  stat_qq()

p3 <- ggplot(
  data = nyt2,
  aes(sample = Impressions, group = Gender, colour = Gender)
) +
  stat_qq()

p4 <- ggplot(
  data = nyt3,
  aes(sample = Impressions, group = Gender, colour = Gender)
) +
  stat_qq()

# Four panels on a 2 x 2 grid.
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

Summary : The ECDFs for age and Impressions show linear graphs after applying the required filters. The Quantile Quantile plots for Age and Impressions shows a linear graph after applying the required filters. It can be concluded for all graphs that both males and females have the same type of graph. The lower end (zeroes) represents females and the upper end (ones) represents the males in the dataset.

To check for normality we need to perform more tests.

More Significance Tests

To check if the variables are significant, we first need to check that they follow a normal distribution. Only then can we conclude anything about significance.

We first define our hypotheses :

  • Null Hypothesis : The variables follow a Normal Distribution

  • Alternative Hypothesis : The variables do not follow a Normal Distribution

We use the Anderson - Darling test to check for normality

Summary : The p-values are far too low, so we reject the null hypothesis and conclude that the distributions are NOT normal. We cannot use an F-Test here because the variables do not follow a normal distribution.

                                         End of Project